# This script examines how samples in the PBTA cluster by composition and cancer type using dimensionality reduction techniques, namely PCA, t-SNE and UMAP.
# magrittr pipe: bind `%>%` to the operator exported by dplyr so that the
# pipelines in this script can use it alongside the `pkg::fun()` call style.
`%>%` <- dplyr::`%>%`
# Align sample metadata to a data.frame of dimension reduction scores.
#
# Arguments:
#   name     data.frame of scores, one row per sample.
#   id       vector of biospecimen IDs, one per row of `name`, same order.
#   metadata data.frame with the columns `Kids_First_Biospecimen_ID`,
#            `disease_type_new` and `composition`. Defaults to the global
#            `df2`, which is only looked up when the function is called.
#
# Returns:
#   `name` restricted to the rows whose ID is present in `metadata`, with
#   three extra columns: `ID`, `type` (factor built from `disease_type_new`)
#   and `composition` (factor). When an ID occurs several times in
#   `metadata`, its first record is used.
reduction_fn <- function(name, id, metadata = df2) {
  required_cols <- c(
    "Kids_First_Biospecimen_ID",
    "disease_type_new",
    "composition"
  )
  stopifnot(all(required_cols %in% colnames(metadata)))

  name$ID <- id

  # Look up each sample's metadata row by ID rather than by position: the
  # metadata table is not guaranteed to be in the same order as the scores,
  # so positional assignment could attach labels to the wrong samples.
  # match() returns the first hit, i.e. the first record of a duplicated ID.
  metadata_row <- match(name$ID, metadata$Kids_First_Biospecimen_ID)

  # Drop samples that have no metadata record at all.
  has_metadata <- !is.na(metadata_row)
  name <- name[has_metadata, , drop = FALSE]
  metadata_row <- metadata_row[has_metadata]

  name$type <- as.factor(metadata$disease_type_new[metadata_row])
  name$composition <- as.factor(metadata$composition[metadata_row])
  name
}
# Make sure the "plots" output directory exists before any figure is saved.
if (isFALSE(dir.exists("plots"))) {
  dir.create("plots")
}
# Load the PBTA histologies table and convert it to a base data.frame
df2 <- file.path("..", "..", "data", "pbta-histologies.tsv") %>%
  readr::read_tsv() %>%
  data.frame()
## Parsed with column specification:
## cols(
## .default = col_character(),
## age_at_diagnosis = col_double()
## )
## See spec(...) for full column specifications.
# Load the kallisto expression data as a base data.frame
exp_kallisto <-
  file.path("..", "..", "data", "pbta-gene-expression-kallisto.rds") %>%
  readr::read_rds() %>%
  data.frame()
# Load the RSEM (FPKM) expression data as a base data.frame
exp_rsem <-
  file.path("..", "..", "data", "pbta-gene-expression-rsem.fpkm.rds") %>%
  readr::read_rds() %>%
  data.frame()
# Move the non-numeric "gene_id" column into the rownames.
# kallisto: drop the first column, keep the first row of each gene_id, then
# promote gene_id to rownames.
exp_kallisto <- tibble::column_to_rownames(
  dplyr::filter(exp_kallisto[, -1], !duplicated(gene_id)),
  var = "gene_id"
)
# RSEM: promote gene_id to rownames, then drop rows containing missing values.
exp_rsem <- na.omit(
  tibble::column_to_rownames(exp_rsem, var = "gene_id")
)
# RSEM: transpose the expression table and keep the resulting row names
transposed_rsem_data <- t(exp_rsem)
rsem_ID <- dimnames(transposed_rsem_data)[[1L]]
# kallisto: transpose the expression table and keep the resulting row names
transposed_kallisto_data <- t(exp_kallisto)
kallisto_ID <- dimnames(transposed_kallisto_data)[[1L]]
# PCA on the transposed RSEM matrix
rsem_pca <- prcomp(x = transposed_rsem_data)
# Keep the scores of the first two components and let reduction_fn attach
# the ID, cancer type and composition columns needed for plotting
rsem_pca_data <- reduction_fn(
  name = data.frame(rsem_pca[["x"]][, 1:2]),
  id = rsem_ID
)
# PCA scatter plot colored by cancer type (legend hidden)
rsem_pca_plot <- ggplot2::ggplot(
  data = rsem_pca_data,
  mapping = ggplot2::aes(
    x = rsem_pca_data[, 1],
    y = rsem_pca_data[, 2],
    color = type
  )
) +
  ggplot2::theme(legend.position = "none") +
  ggplot2::geom_point()
# PCA scatter plot colored by composition (legend hidden)
rsem_pca_plot_composition <- ggplot2::ggplot(
  data = rsem_pca_data,
  mapping = ggplot2::aes(
    x = rsem_pca_data[, 1],
    y = rsem_pca_data[, 2],
    color = composition
  )
) +
  ggplot2::theme(legend.position = "none") +
  ggplot2::geom_point()
# t-SNE on the transposed RSEM matrix
rsem_tsne <- Rtsne::Rtsne(transposed_rsem_data)
# Put the t-SNE coordinates in a data.frame and let reduction_fn attach the
# ID, cancer type and composition columns needed for plotting
rsem_tsne_data <- reduction_fn(
  name = data.frame(rsem_tsne[["Y"]]),
  id = rsem_ID
)
# t-SNE scatter plot colored by cancer type (legend hidden)
rsem_tsne_plot <- ggplot2::ggplot(
  data = rsem_tsne_data,
  mapping = ggplot2::aes(
    x = rsem_tsne_data[, 1],
    y = rsem_tsne_data[, 2],
    color = type
  )
) +
  ggplot2::theme(legend.position = "none") +
  ggplot2::geom_point()
# t-SNE scatter plot colored by composition (legend hidden)
rsem_tsne_plot_composition <- ggplot2::ggplot(
  data = rsem_tsne_data,
  mapping = ggplot2::aes(
    x = rsem_tsne_data[, 1],
    y = rsem_tsne_data[, 2],
    color = composition
  )
) +
  ggplot2::theme(legend.position = "none") +
  ggplot2::geom_point()
# Run UMAP on RSEM
rsem_umap <- umap::umap(transposed_rsem_data)
# Make a data.frame with umap scores
rsem_umap_data <- data.frame(rsem_umap$layout)
# Run the reduction_fn which aligns metadata and prepares data.frame for ggplot
rsem_umap_data <- reduction_fn(rsem_umap_data, rsem_ID)
# Plot the umap scores and color by cancer type.
# Both axes must come from the UMAP layout (columns 1 and 2 of
# rsem_umap_data); mixing in coordinates from another embedding would plot
# unrelated quantities against each other.
rsem_umap_plot <- ggplot2::ggplot(
  rsem_umap_data,
  ggplot2::aes(
    x = rsem_umap_data[, 1],
    y = rsem_umap_data[, 2],
    color = type
  )
) +
  ggplot2::geom_point(size = 1) +
  ggplot2::theme(
    legend.position = "bottom",
    legend.text = ggplot2::element_text(size = 3)
  )
# Plot the umap scores and color by composition (legend hidden)
rsem_umap_plot_composition <- ggplot2::ggplot(
  rsem_umap_data,
  ggplot2::aes(
    x = rsem_umap_data[, 1],
    y = rsem_umap_data[, 2],
    color = composition
  )
) +
  ggplot2::geom_point(size = 1) +
  ggplot2::theme(legend.position = "none")
# Arrange the six RSEM plots in a three-column grid: the composition-colored
# plots fill the first row, the cancer-type-colored plots the second
meta_grid_rsem <- gridExtra::grid.arrange(
  grobs = list(
    rsem_pca_plot_composition,
    rsem_tsne_plot_composition,
    rsem_umap_plot_composition,
    rsem_pca_plot,
    rsem_tsne_plot,
    rsem_umap_plot
  ),
  ncol = 3,
  top = "Composition and Cancer Types, respectively (RSEM)"
)
# Write the grid to a PDF in the plots directory
ggplot2::ggsave(
  filename = file.path("plots", "meta_grid_rsem.pdf"),
  plot = meta_grid_rsem,
  width = 16,
  height = 18
)
# PCA on the transposed kallisto matrix
kallisto_pca <- prcomp(x = transposed_kallisto_data)
# Keep the scores of the first two components and let reduction_fn attach
# the ID, cancer type and composition columns needed for plotting
kallisto_pca_data <- reduction_fn(
  name = data.frame(kallisto_pca[["x"]][, 1:2]),
  id = kallisto_ID
)
# PCA scatter plot colored by cancer type (legend hidden)
kallisto_pca_plot <- ggplot2::ggplot(
  data = kallisto_pca_data,
  mapping = ggplot2::aes(
    x = kallisto_pca_data[, 1],
    y = kallisto_pca_data[, 2],
    color = type
  )
) +
  ggplot2::theme(legend.position = "none") +
  ggplot2::geom_point()
# PCA scatter plot colored by composition (legend hidden)
kallisto_pca_plot_composition <- ggplot2::ggplot(
  data = kallisto_pca_data,
  mapping = ggplot2::aes(
    x = kallisto_pca_data[, 1],
    y = kallisto_pca_data[, 2],
    color = composition
  )
) +
  ggplot2::theme(legend.position = "none") +
  ggplot2::geom_point()
# t-SNE on the transposed kallisto matrix
kallisto_tsne <- Rtsne::Rtsne(transposed_kallisto_data)
# Put the t-SNE coordinates in a data.frame and let reduction_fn attach the
# ID, cancer type and composition columns needed for plotting
kallisto_tsne_data <- reduction_fn(
  name = data.frame(kallisto_tsne[["Y"]]),
  id = kallisto_ID
)
# t-SNE scatter plot colored by cancer type (legend hidden)
kallisto_tsne_plot <- ggplot2::ggplot(
  data = kallisto_tsne_data,
  mapping = ggplot2::aes(
    x = kallisto_tsne_data[, 1],
    y = kallisto_tsne_data[, 2],
    color = type
  )
) +
  ggplot2::theme(legend.position = "none") +
  ggplot2::geom_point()
# t-SNE scatter plot colored by composition (legend hidden)
kallisto_tsne_plot_composition <- ggplot2::ggplot(
  data = kallisto_tsne_data,
  mapping = ggplot2::aes(
    x = kallisto_tsne_data[, 1],
    y = kallisto_tsne_data[, 2],
    color = composition
  )
) +
  ggplot2::theme(legend.position = "none") +
  ggplot2::geom_point()
# UMAP on the transposed kallisto matrix
kallisto_umap <- umap::umap(transposed_kallisto_data)
# Put the UMAP layout in a data.frame and let reduction_fn attach the ID,
# cancer type and composition columns needed for plotting
kallisto_umap_data <- reduction_fn(
  name = data.frame(kallisto_umap[["layout"]]),
  id = kallisto_ID
)
# UMAP scatter plot colored by cancer type, small legend at the bottom
kallisto_umap_plot <- ggplot2::ggplot(
  data = kallisto_umap_data,
  mapping = ggplot2::aes(
    x = kallisto_umap_data[, 1],
    y = kallisto_umap_data[, 2],
    color = type
  )
) +
  ggplot2::theme(
    legend.position = "bottom",
    legend.text = ggplot2::element_text(size = 4)
  ) +
  ggplot2::geom_point(size = 1)
# UMAP scatter plot colored by composition, small legend at the bottom
kallisto_umap_plot_composition <- ggplot2::ggplot(
  data = kallisto_umap_data,
  mapping = ggplot2::aes(
    x = kallisto_umap_data[, 1],
    y = kallisto_umap_data[, 2],
    color = composition
  )
) +
  ggplot2::theme(
    legend.position = "bottom",
    legend.text = ggplot2::element_text(size = 4)
  ) +
  ggplot2::geom_point(size = 1)
# Arrange the six kallisto plots in a three-column grid: the
# composition-colored plots fill the first row, the cancer-type-colored plots
# the second
meta_grid_kallisto <- gridExtra::grid.arrange(
  grobs = list(
    kallisto_pca_plot_composition,
    kallisto_tsne_plot_composition,
    kallisto_umap_plot_composition,
    kallisto_pca_plot,
    kallisto_tsne_plot,
    kallisto_umap_plot
  ),
  ncol = 3,
  top = "Composition and Cancer Types, respectively (kallisto)"
)
# Write the grid to a PDF in the plots directory
ggplot2::ggsave(
  filename = file.path("plots", "meta_grid_kallisto.pdf"),
  plot = meta_grid_kallisto,
  width = 23,
  height = 19
)